﻿using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading;

namespace NetworkData.Spider
{
    /// <summary>
    /// Performs all of the work of a single spider thread: waiting for a URL
    /// to become available, downloading it, storing it below the spider's
    /// output path and queueing the links found in text documents.
    /// </summary>
    public class DocumentWorker
    {
        /// <summary>
        /// Size, in bytes, of the buffer used when copying binary responses to disk.
        /// </summary>
        private const int BufferSize = 8192;

        /// <summary>
        /// Characters that may not appear in a file or directory name on the
        /// current platform; looked up once because the set never changes.
        /// </summary>
        private static readonly char[] InvalidFileNameChars = Path.GetInvalidFileNameChars();

        /// <summary>
        /// The URI currently being processed by this worker. Relative links
        /// found in the page are resolved against it.
        /// </summary>
        private Uri _mUri;

        /// <summary>
        /// The spider that this worker "works for".
        /// </summary>
        private readonly Spider _mSpider;

        /// <summary>
        /// The thread that runs <see cref="Process"/>; created by <see cref="Start"/>.
        /// </summary>
        private Thread _mThread;

        /// <summary>
        /// The file name used for documents whose URL does not name a file
        /// (for example a directory URL such as <c>/images/</c>).
        /// </summary>
        public const string IndexFile = "index.html";

        /// <summary>
        /// Creates a worker that belongs to the given spider.
        /// </summary>
        /// <param name="spider">The spider that owns this worker.</param>
        /// <exception cref="ArgumentNullException"><paramref name="spider"/> is null.</exception>
        public DocumentWorker(Spider spider)
        {
            if (spider == null)
            {
                throw new ArgumentNullException("spider");
            }

            _mSpider = spider;
        }

        /// <summary>
        /// Replaces every character that is not valid in a file name with an
        /// underscore, and neutralizes the special "." and ".." segments, so a
        /// URL path segment can be used safely as a single local path component.
        /// </summary>
        /// <param name="segment">One segment of a URL path (no slashes).</param>
        /// <returns>A name that is safe to use as one file-system path component.</returns>
        private static string SanitizeSegment(string segment)
        {
            // Dot segments would otherwise let a URL step outside the output directory.
            if (segment == "." || segment == "..")
            {
                return "_";
            }

            var safe = new StringBuilder(segment.Length);
            foreach (char c in segment)
            {
                safe.Append(Array.IndexOf(InvalidFileNameChars, c) >= 0 ? '_' : c);
            }

            return safe.ToString();
        }

        /// <summary>
        /// Takes a URI, such as one whose path is /images/blank.gif, and
        /// converts it into the name of a file below the spider's output path.
        /// The directory structure needed to hold the file (including the
        /// output directory itself) is created if it does not exist yet.
        /// </summary>
        /// <param name="uri">The URI of the file about to be stored.</param>
        /// <returns>The local path the document should be written to.</returns>
        private string ConvertFilename(Uri uri)
        {
            char separator = Path.DirectorySeparatorChar;
            string root = _mSpider.OutputPath ?? string.Empty;

            // Add an ending separator if needed. An empty root is left alone
            // so the result stays relative instead of pointing at the drive root.
            if (root.Length > 0)
            {
                char last = root[root.Length - 1];
                if (last != separator && last != Path.AltDirectorySeparatorChar)
                {
                    root += separator;
                }
            }

            // AbsolutePath never contains the query string or fragment.
            string path = uri.AbsolutePath;
            if (path.Length == 0)
            {
                path = "/";
            }

            // A final segment without a dot is treated as a directory that is
            // missing its ending slash, so the default document name is appended.
            int lastSlash = path.LastIndexOf('/');
            int lastDot = path.LastIndexOf('.');
            if (path[path.Length - 1] != '/' && lastSlash > lastDot)
            {
                path += "/" + IndexFile;
            }

            // Split into the directory part and the actual file name.
            lastSlash = path.LastIndexOf('/');
            string directory = path.Substring(0, lastSlash + 1);
            string filename = path.Substring(lastSlash + 1);
            if (filename.Length == 0)
            {
                filename = IndexFile;
            }

            // Mirror the URL's directory structure below the output root.
            var local = new StringBuilder(root);
            foreach (string segment in directory.Split('/'))
            {
                // Empty segments come from the leading slash or from "//".
                if (segment.Length == 0)
                {
                    continue;
                }

                local.Append(SanitizeSegment(segment)).Append(separator);
            }

            string localDirectory = local.ToString();
            if (localDirectory.Length > 0)
            {
                // Creates every missing directory along the path in one call.
                Directory.CreateDirectory(localDirectory);
            }

            return localDirectory + SanitizeSegment(filename);
        }

        /// <summary>
        /// Saves the body of a response to disk as a binary file. Nothing is
        /// saved when the spider has no output path or the response has no stream.
        /// </summary>
        /// <param name="response">The response whose body is to be saved.</param>
        private void SaveBinaryFile(WebResponse response)
        {
            if (string.IsNullOrEmpty(_mSpider.OutputPath))
            {
                return;
            }

            // The using blocks guarantee both streams are released even when
            // reading from the network or writing to disk fails part-way.
            using (Stream inStream = response.GetResponseStream())
            {
                if (inStream == null)
                {
                    return;
                }

                string filename = ConvertFilename(response.ResponseUri);
                using (Stream outStream = File.Create(filename))
                {
                    var buffer = new byte[BufferSize];
                    int read;
                    while ((read = inStream.Read(buffer, 0, buffer.Length)) > 0)
                    {
                        outStream.Write(buffer, 0, read);
                    }
                }
            }
        }

        /// <summary>
        /// Saves a text document to disk under the name derived from the URI
        /// currently being processed. Failures are logged to the console and
        /// otherwise ignored, so a page that cannot be stored is still parsed
        /// for links by the caller.
        /// </summary>
        /// <param name="buffer">The text to save.</param>
        private void SaveTextFile(string buffer)
        {
            string filename = "";
            try
            {
                if (string.IsNullOrEmpty(_mSpider.OutputPath))
                {
                    return;
                }

                filename = ConvertFilename(_mUri);
                using (var outStream = new StreamWriter(filename))
                {
                    outStream.Write(buffer);
                }
            }
            catch (Exception e)
            {
                Console.WriteLine("Save URI:" + _mUri + " to file " + filename + " Error:" + e.Message);
            }
        }

        /// <summary>
        /// Downloads the document at the URI currently being processed.
        /// Documents whose content type does not start with "text/" are
        /// written straight to disk; text documents are read line by line
        /// (every line ending becomes CR LF), saved and returned.
        /// </summary>
        /// <returns>
        /// The text of the page, or null when the document was not text or
        /// the download failed with a <see cref="WebException"/> or
        /// <see cref="IOException"/>.
        /// </returns>
        private string GetPage()
        {
            WebResponse response = null;

            try
            {
                var request = (HttpWebRequest)WebRequest.Create(_mUri);
                response = request.GetResponse();

                // Ordinal comparison: MIME types are protocol tokens, so the
                // check must not depend on the current culture's casing rules.
                string contentType = response.ContentType ?? string.Empty;
                if (!contentType.StartsWith("text/", StringComparison.OrdinalIgnoreCase))
                {
                    SaveBinaryFile(response);
                    return null;
                }

                // StringBuilder keeps the read linear in the size of the page.
                var text = new StringBuilder();
                Stream stream = response.GetResponseStream();
                if (stream != null)
                {
                    // Disposing the reader also closes the response stream.
                    using (var reader = new StreamReader(stream))
                    {
                        string line;
                        while ((line = reader.ReadLine()) != null)
                        {
                            text.Append(line).Append("\r\n");
                        }
                    }
                }

                string page = text.ToString();
                SaveTextFile(page);
                return page;
            }
            catch (WebException e)
            {
                Console.WriteLine("下载失败，错误：" + e);
                return null;
            }
            catch (IOException e)
            {
                Console.WriteLine("下载失败，错误：" + e);
                return null;
            }
            finally
            {
                if (response != null)
                {
                    response.Close();
                }
            }
        }

        /// <summary>
        /// Processes a link found in a page. The link is handed to the spider
        /// if it is an http or https document and is on the same host as the
        /// URI currently being processed. Whether it has been visited before
        /// is left to the spider.
        /// </summary>
        /// <param name="link">The (possibly relative) URL to process.</param>
        private void ProcessLink(string link)
        {
            if (link == null)
            {
                return;
            }

            Uri url;

            // Fully expand this URL if it was a relative link.
            try
            {
                url = new Uri(_mUri, link);
            }
            catch (UriFormatException e)
            {
                Console.WriteLine("Invalid URI:" + link + " Error:" + e.Message);
                return;
            }

            bool isWebScheme =
                string.Equals(url.Scheme, Uri.UriSchemeHttp, StringComparison.OrdinalIgnoreCase) ||
                string.Equals(url.Scheme, Uri.UriSchemeHttps, StringComparison.OrdinalIgnoreCase);
            if (!isWebScheme)
            {
                return;
            }

            // Remove this check if you would like to spider the whole
            // Internet (yeah right, but it will try).
            if (!string.Equals(url.Host, _mUri.Host, StringComparison.OrdinalIgnoreCase))
            {
                return;
            }

            _mSpider.AddUri(url);
        }

        /// <summary>
        /// Parses a downloaded page and processes the HREF and SRC attribute
        /// of every tag that has one.
        /// </summary>
        /// <param name="page">The text of the page to parse.</param>
        private void ProcessPage(string page)
        {
            var parse = new ParseHtml { Source = page };

            while (!parse.Eof())
            {
                char ch = parse.Parse();

                // NOTE(review): a result of 0 appears to mean "a tag was
                // parsed" rather than a plain character - confirm in ParseHtml.
                if (ch == 0)
                {
                    Attribute a = parse.GetTag()["HREF"];
                    if (a != null)
                    {
                        ProcessLink(a.Value);
                    }

                    a = parse.GetTag()["SRC"];
                    if (a != null)
                    {
                        ProcessLink(a.Value);
                    }
                }
            }
        }

        /// <summary>
        /// The main loop for a spider thread. Obtains URLs from the spider
        /// and processes them until the spider signals that it is quitting.
        /// </summary>
        public void Process()
        {
            while (!_mSpider.Quit)
            {
                _mUri = _mSpider.ObtainWork();
                if (_mUri == null)
                {
                    // NOTE(review): assumes ObtainWork may hand back null
                    // (e.g. while shutting down) - confirm against Spider.
                    continue;
                }

                _mSpider.SpiderDone.WorkerBegin();
                try
                {
                    Console.WriteLine("Download(" + Number + "):" + _mUri);
                    string page = GetPage();
                    if (page != null)
                    {
                        ProcessPage(page);
                    }
                }
                catch (Exception e)
                {
                    // One bad document must not take the worker down;
                    // log it and move on to the next URL.
                    Console.WriteLine("Worker(" + Number + ") failed on " + _mUri + " Error:" + e);
                }
                finally
                {
                    // Always balance WorkerBegin, even when processing failed.
                    _mSpider.SpiderDone.WorkerEnd();
                }
            }
        }

        /// <summary>
        /// Starts a new thread that runs <see cref="Process"/>.
        /// </summary>
        public void Start()
        {
            _mThread = new Thread(Process);
            _mThread.Start();
        }

        /// <summary>
        /// The thread number. Used only to identify this worker in log output.
        /// </summary>
        public int Number { get; set; }
    }
}
